/*
Import Data From Cortellis on Approved Drugs
*/


clear
import excel using "Approved Drugs Summary.xls", first sh("Results")

// generate some indicators

save approved_drugs_all, replace



clear
gen indication = ""
save gennme_lists_by_indication, replace



foreach indic in "Hypertension" "Non-insulin dependent diabetes" "Chronic obstructive pulmonary disease" "HIV" "Hypercholesterolemia" {

	clear
	use approved_drugs_all
	
	keep if strpos(ActiveIndications, "`indic'") > 0
	
		// do some data processing
	keep DrugName OtherDrugNames Technologies ActiveIndications
	gen indication = "`indic'"
	
	// standard cleaning of drug names
	gen paren = strpos(DrugName, "(")
	replace DrugName = substr(DrugName, 1, paren - 1) if paren > 0
	gen comma = strpos(DrugName, ",")
	replace DrugName = substr(DrugName, 1, comma - 1) if comma > 0
	
	replace DrugName = subinstr(DrugName, " + ", "/", .)
	replace DrugName = trim(DrugName)
	replace DrugName = upper(DrugName)
	compress
	
	drop paren comma
	
	rename DrugName gennme
	
	
	append using gennme_lists_by_indication
	save gennme_lists_by_indication, replace

}


clear
use gennme_lists_by_indication

gen tot_indications = length(trim(ActiveIndications))-length(trim(subinstr(ActiveIndications,";","",.)))
drop if tot_indications > 5 // multi-purpose drugs


// do some cleaning for each area
gen aerosol = strpos(Technologies, "Inhalant") > 0
drop if indication == "Chronic obstructive pulmonary disease" & aerosol == 0


save gennme_lists_by_indication, replace





// expand to include brand name
clear
use gennme_lists_by_indication


split OtherDrugNames, p("; ")
rename OtherDrugNames ref

gen id = _n
reshape long OtherDrugNames, i(id) j(name_num)

// clean OtherDrugNames
rename OtherDrugNames prodnme
replace prodnme = trim(prodnme)
drop if prodnme == ""


// standard cleaning of drug names
gen paren = strpos(prodnme, "(")
replace prodnme = substr(prodnme, 1, paren - 1) if paren > 0
gen comma = strpos(prodnme, ",")
replace prodnme = substr(prodnme, 1, comma - 1) if comma > 0

replace prodnme = subinstr(prodnme, " + ", "/", .)
replace prodnme = trim(prodnme)
replace prodnme = upper(prodnme)

keep prodnme indication ref
egen k = tag(prodnme indication)
keep if k == 1
drop k

// more filtering
drop if prodnme == "" // some HIV drug nicknames start with parentheses
duplicates tag prodnme, gen(t)
tab t
drop if t>0
drop t

drop ref
compress
save prodnme_lists_by_indication, replace



// simplify resource
clear
use gennme_lists_by_indication

keep gennme indication
duplicates drop

// same two cases are problematic
duplicates tag gennme, gen(t)
 tab t
 drop if t>0
 drop t

save gennme_lists_by_indication, replace


